## R Script Outputs ------------------------------------------------------------
## Appendix Figure B.1: Frequency Distribution: String Distances Between Petitioner Firm Names and Orbis Batch-Searched Firm Names


## Instructions ----------------------------------------------------------------
# Step 1: Adjust MAIN_DIR to where README.txt is located
# Step 2: Run entire script


## setup -----------------------------------------------------------------------
# clean slate: removes every object from the global environment so the script
# starts from a known state (attached packages and options are not affected)
rm(list = ls())
date()

# load packages
pkg <- c("tidyverse", 
         "RColorBrewer", 
         "viridis", 
         "gridExtra",
         "stringdist")

# library() stops with an error as soon as a package is not installed;
# require() would only return FALSE with a warning and let the script continue
# until a later, harder-to-diagnose "could not find function" failure.
# invisible() suppresses the list that lapply() would otherwise print.
invisible(lapply(pkg, library, character.only = TRUE))

# set main directory (Step 1 of the instructions: folder containing README.txt)
MAIN_DIR <- "~/Dropbox/Research/JOP-h1b-replication"


## load data -------------------------------------------------------------------
# restores the objects saved in the .RData file into the current environment;
# the code below relies on it providing `data.merge`
load(file = file.path(MAIN_DIR, "data-merge-91-17-orbis-excluded.RData"))

# petitioner name: use the H-1B petitioner name when present, otherwise fall
# back to the L-1 petitioner name
df.sub <- mutate(
  data.merge,
  pet_name = ifelse(!is.na(h1b_pet_name), h1b_pet_name, l1_pet_name)
)

# unique (Orbis name, petitioner name) pairs, plus cleaned copies of both names
# in which every punctuation character is replaced by a single space
df.matches <- df.sub %>%
  select(bvd_name, pet_name) %>%
  distinct() %>%
  mutate(
    bvd_name_clean = str_replace_all(bvd_name, "[[:punct:]]", " "),
    pet_name_clean = str_replace_all(pet_name, "[[:punct:]]", " ")
  )


## compute quantities of interest ----------------------------------------------
# cosine string distance between the cleaned Orbis name and the cleaned
# petitioner name of each pair, stored next to the pair itself
dist <- df.matches %>%
  mutate(cosine_dist = stringdist(bvd_name_clean,
                                  pet_name_clean,
                                  method = "cosine"))

# check: number of pairs overall and within each distance band
# (a pair whose distance is NA is counted in the total but in none of the bands)
n.dist.all <- nrow(dist)

n.dist.0 <- sum(dist$cosine_dist == 0, na.rm = TRUE)

n.dist.25 <- sum(dist$cosine_dist <= 0.25, na.rm = TRUE)

n.dist.50 <- sum(dist$cosine_dist <= 0.5, na.rm = TRUE)

n.dist.50.100 <- sum(dist$cosine_dist > 0.5 & dist$cosine_dist < 1,
                     na.rm = TRUE)

# share of pairs (in percent) with cosine distance exactly 0
n.dist.0 / n.dist.all * 100

# share of pairs (in percent) with cosine distance <= 0.25
n.dist.25 / n.dist.all * 100

# share of pairs (in percent) with cosine distance <= 0.50
n.dist.50 / n.dist.all * 100

# share of pairs (in percent) with cosine distance strictly between 0.50 and 1
n.dist.50.100 / n.dist.all * 100


## Figure B.1: Frequency Distribution: String Distances Between Petitioner Firm Names and Orbis Batch-Searched Firm Names ----
# plotting parameters; the options() call changes session-wide number
# formatting (large scipen keeps numbers in fixed rather than scientific form)
axis.title.size <- 16
options(scipen = 100, digits = 4)

# histogram of cosine distances: bins of width 0.01, raw counts on the y axis,
# which is fixed to the range 0-300,000 with a break every 100,000
p.distr <- ggplot(dist, aes(x = cosine_dist, y = after_stat(count))) +
  geom_histogram(
    binwidth = 0.01,
    fill = "black",
    alpha = 1,
    position = "identity"
  ) +
  scale_x_continuous(name = "Cosine Distance") +
  scale_y_continuous(
    name = "Frequency\n",
    breaks = seq(0, 300000, by = 100000),
    labels = seq(0, 300000, by = 100000),
    limits = c(0, 300000)
  ) +
  theme_bw() +
  theme(
    # text elements
    plot.title = element_text(
      size = axis.title.size,
      face = "bold",
      margin = margin(0, 0, 10, 0),
      hjust = 0.5
    ),
    axis.title.x = element_text(
      size = axis.title.size,
      margin = margin(10, 0, 0, 0)
    ),
    axis.title.y = element_text(
      size = axis.title.size,
      margin = margin(0, 10, 0, 0)
    ),
    axis.text = element_text(size = axis.title.size - 2),
    strip.text = element_text(size = axis.title.size, face = "bold"),
    # keep only the major horizontal grid lines
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    strip.background = element_blank(),
    legend.position = "none"
  )

# save: write the figure to a 5 x 4 inch PDF inside MAIN_DIR
pdf(file.path(MAIN_DIR, "Appendix-Figure-B1.pdf"),
    width = 5, height = 4)
print(p.distr)
dev.off()
